Quality of station data

We want to look at a group of station data and test a strategy for assessing data quality. This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Station data

Get some data to play with.

library(esd)
## Loading required package: ncdf4
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'esd'
## The following object is masked from 'package:base':
## 
##     subset.matrix
library(utils)
if (!file.exists('precip.norway.rda')) download.file('https://ndownloader.figshare.com/files/2155751','precip.norway.rda')
load('precip.norway.rda')
## The precipitation data is 'Pr'

Now inspect the data - check for missing values

diagnose(Pr)

Estimate annual aggregates:

mu <- annual(Pr,'wetmean')
fw <- annual(Pr,'wetfreq')
map(mu,FUN='trend',new=FALSE)

Plot the original daily data

plot(Pr,new=FALSE)

The wet-day mean precipitation \(\mu\)

plot(mu,new=FALSE)

The wet-day frequency \(f_w\)

plot(fw,new=FALSE)

Apply a principal component analysis (PCA) to the annual aggregates:

pca.mu <- PCA(mu)
pca.fw <- PCA(fw)

Synthesise errors

Take a copy of the original data and mess it up by introducing deliberate errors and artifacts to degrade the information embedded. The idea is to explore the ways such errors affect the end result.

X <- Pr
## Degrade the original data by synthetically introducing of errors and problems
d <- dim(X)
## Indices with values set to zero
s20.1 <- sample(1:d[1],1000); s20.2 <- sample(1:d[2],10)
## Indices with values set to random
s2r.1 <- sample(1:d[1],1000); s2r.2 <- sample(1:d[2],10)
## Set a random sub-selection of data points to zero or random values
X[s20.1,s20.2] <- 0
X[s2r.1,s2r.2] <- rexp(10000,rate=0.001)
## Set some locations to bad values
S2R <- sample(1:d[2],3)
X[,S2R] <- rexp(d[1],rate=c(1,0.1,0.5))

Estimate synthetic annual aggregates for \(\mu\) and \(f_w\).

mu.s <- annual(X,'wetmean')
fw.s <- annual(X,'wetfreq')
map(mu,FUN='trend',new=FALSE)

Plot the degraded data:

plot(X,new=FALSE)

The aggregated degraded data: \(\mu\)

plot(mu.s,new=FALSE)

and \(f_w\)

plot(fw.s,new=FALSE)

analysis of the original and degraded data

Apply PCA to the degraded data:

pca.smu <- PCA(mu.s)
pca.sfw <- PCA(fw.s)

Plot the leading PCA mode of the original \(\mu\)

plot(pca.mu,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

Plot the leading PCA mode of the degraded \(\mu\)

plot(pca.smu,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

Plot the leading PCA mode of the original \(\mu\)

plot(pca.fw,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

Plot the leading PCA mode of the degraded \(\mu\)

plot(pca.sfw,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

Second information source

Retrieve mean sea-level pressure (SLP) and use a regression analysis to examine the data, knowing that the SLP is related to \(f_w\).

#slp <- annual(retrieve('slp.mon.mean.nc',lon=c(-60,30),lat=c(50,70)))
#eof.slp<- EOF(slp)
#save(file='eof.slp.rda',eof.slp)
load('eof.slp.rda')
ds <- DS(subset(pca.fw,pattern=1:3),eof.slp)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======================                                           |  33%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric
## 
  |                                                                       
  |===========================================                      |  67%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric
## 
  |                                                                       
  |=================================================================| 100%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric
ds.s <- DS(subset(pca.sfw,pattern=1:3),eof.slp)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======================                                           |  33%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric
## 
  |                                                                       
  |===========================================                      |  67%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric
## 
  |                                                                       
  |=================================================================| 100%
## Warning in DS.station(ys, X, biascorrect = biascorrect, m = m, eofs =
## eofs, : DS.station: different indices: Date numeric

Compare the results:

plot(ds,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

## NULL
plot(ds.s,new=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "plot" is not a
## graphical parameter

## NULL